Expedia logs of customer behavior

These include what customers searched for, how they interacted with search results (click/book), whether or not the search result was a travel package. Expedia is interested in predicting which hotel group a user is going to book. Expedia has in-house algorithms to form hotel clusters, where similar hotels for a search (based on historical price, customer star ratings, geographical locations relative to city center, etc) are grouped together.

Training/Test Data

Column name Description Data type
date_time Timestamp string
site_name ID of the Expedia point of sale
(i.e. Expedia.com, Expedia.co.uk, Expedia.co.jp, ...)
int
posa_continent ID of continent associated with site_name int
user_location_country The ID of the country where the customer is located int
user_location_region The ID of the region where the customer is located int
user_location_city The ID of the city where the customer is located int
orig_destination_distance Physical distance between a hotel and a customer at the time of search.
A null means the distance could not be calculated
double
user_id ID of user int
is_mobile 1 when a user connected from a mobile device, 0 otherwise tinyint
is_package 1 if the click/booking was generated as a part of a package
(i.e. combined with a flight), 0 otherwise
int
channel ID of a marketing channel int
srch_ci Checkin date string
srch_co Checkout date string
srch_adults_cnt The number of adults specified in the hotel room int
srch_children_cnt The number of (extra occupancy) children specified in the hotel room int
srch_rm_cnt The number of hotel rooms specified in the search int
srch_destination_id ID of the destination where the hotel search was performed int
srch_destination_type_id Type of destination int
hotel_continent Hotel continent int
hotel_country Hotel country int
hotel_market Hotel market int
is_booking 1 if a booking, 0 if a click tinyint
cnt Number of similar events in the context of the same user session bigint
hotel_cluster ID of a hotel cluster int

Destinations

Column name Description Data type
srch_destination_id ID of the destination where the hotel search was performed int
d1-d149 latent description of search regions double

Data Exploration


In [ ]:
# Switch date/time formatting to an English UTF-8 locale; the weekday
# abbreviations used further down ("Mon", "Tue", ...) rely on English names.
Sys.setlocale(category = "LC_TIME", locale = "en_US.UTF-8")
# Set the LANG environment variable of this R process to the same locale.
Sys.setenv(LANG = "en_US.UTF-8")

library(ggplot2)    # Data visualization
library(data.table) # Faster data reading
library(dplyr)      # Data aggregation etc.
library(scales)     # Plot scaling
library(gridExtra)  # Arrange plots
library(corrplot)   # Correlations

In [ ]:
# Decompress the bzip2 archive with `bzcat` and let fread() parse the resulting
# comma-separated stream; the first line holds the column names.
train <- fread(
  input  = "bzcat ../../data/expedia/train500k.csv.bz2",
  sep    = ",",
  header = TRUE
)

In [ ]:
# convert to factors
# Every column becomes a factor except the raw timestamp, the numeric distance
# and the check-in/check-out dates, which are parsed as dates below (turning
# them into factors first would only be undone by as.Date()).
cols <- colnames(train)
non_factor_cols <- c("date_time", "orig_destination_distance",
                     "srch_ci", "srch_co")
for (i in setdiff(cols, non_factor_cols)) {
  train[[i]] <- as.factor(train[[i]])
}

# convert dates
# Parse the timestamp with an explicit UTC time zone: as.Date() on a POSIXct
# value extracts the calendar day in UTC by default, so parsing in the session's
# local time zone could shift `date` by one day for events close to midnight.
train$date_time <- as.POSIXct(train$date_time, tz = "UTC")
train$date <- as.Date(train$date_time)
train$srch_ci <- as.Date(train$srch_ci)
train$srch_co <- as.Date(train$srch_co)

In [ ]:
# Print a compact overview of `train` after the type conversions
# (one line per column with its type and first values).
str(train)

Booking Analysis


In [ ]:
# Number of logged events per calendar day, separately for clicks
# (is_booking = 0) and bookings (is_booking = 1).
# summarize() only peels off the last grouping variable, so the result would
# still be grouped by `date`; ungroup() returns a plain table so that later
# operations on train.agg are not silently performed per day.
train.agg <- train %>%
  group_by(date, is_booking) %>%
  summarize(count = n()) %>%
  ungroup()

# One line per is_booking level over time.
ggplot(train.agg, aes(x = date, y = count, color = is_booking)) +
  geom_line(size = 0.2) +
  theme(legend.position = "top")

In [ ]:
# Frequency of each level of the three binary flags.
isBook <- ggplot(data = train, mapping = aes(x = is_booking)) +
  geom_bar()
isMobl <- ggplot(data = train, mapping = aes(x = is_mobile)) +
  geom_bar()
isPckg <- ggplot(data = train, mapping = aes(x = is_package)) +
  geom_bar()

# Show the three charts side by side in a single row.
grid.arrange(isBook, isMobl, isPckg, ncol = 3, nrow = 1)

Skewed Distributions


In [ ]:
# Bar charts of four categorical columns: marketing channel, destination type,
# number of rooms and the per-session event count.
chnl <- ggplot(data = train, mapping = aes(x = channel)) +
  geom_bar()
sdti <- ggplot(data = train, mapping = aes(x = srch_destination_type_id)) +
  geom_bar()
room <- ggplot(data = train, mapping = aes(x = srch_rm_cnt)) +
  geom_bar()
cnt <- ggplot(data = train, mapping = aes(x = cnt)) +
  geom_bar()

# Stack the four charts vertically.
grid.arrange(chnl, sdti, room, cnt, ncol = 1, nrow = 4)

Adults vs. Children


In [ ]:
# Distribution of the number of adults and of children given in the search.
srch_adlt <- ggplot(data = train, mapping = aes(x = srch_adults_cnt)) +
  geom_bar(fill = "blue")
srch_chld <- ggplot(data = train, mapping = aes(x = srch_children_cnt)) +
  geom_bar(fill = "blue")

# Adults on top, children below.
grid.arrange(srch_adlt, srch_chld, ncol = 1, nrow = 2)

Continent


In [ ]:
# Continent of the point of sale versus continent of the hotel.
posa_cont <- ggplot(data = train, mapping = aes(x = posa_continent)) +
  geom_bar(fill = "blue")
hotl_cont <- ggplot(data = train, mapping = aes(x = hotel_continent)) +
  geom_bar(fill = "blue")

# Point-of-sale chart on top, hotel chart below.
grid.arrange(posa_cont, hotl_cont, ncol = 1, nrow = 2)

In [ ]:
# Frequency of every hotel cluster (the prediction target). Bars are coloured
# per cluster; the legend is hidden because it would only repeat the x axis.
ggplot(data = train, mapping = aes(x = hotel_cluster, fill = hotel_cluster)) +
  geom_bar() +
  theme(legend.position = "none")

Booking in advance


In [ ]:
# Check-in date plotted against the date of the event, for bookings only.
bookings <- train[train$is_booking == 1]

# Plot a random subset of at most 10,000 bookings. The sample size is capped at
# the number of available rows because sample() raises "cannot take a sample
# larger than the population" when asked for more rows than exist.
n_bookings <- nrow(bookings)
shown_rows <- sample.int(n_bookings, min(10000L, n_bookings))

ggplot(bookings[shown_rows], aes(x = date, y = srch_ci)) +
  geom_point(color = "blue", alpha = 0.1, size = 0.4)

Feature Engineering

  • Trip duration
  • Booking ahead
  • Weekdays
  • Weekend stay

Trip Duration


In [ ]:
# Trip duration in days (check-out minus check-in). Negative durations are
# treated as invalid and replaced by NA.
train$tripDur <- as.numeric(train$srch_co - train$srch_ci)
train$tripDur[train$tripDur < 0] <- NA

# geom_bar() no longer accepts `binwidth`; binning a numeric variable is the
# job of geom_histogram(), which is what older ggplot2 versions silently
# delegated to. One bin per day, x axis restricted to 0-30 days.
srch_tripDur <- ggplot(train[train$is_booking == 0], aes(x = tripDur)) +
  geom_histogram(binwidth = 1) +
  xlim(0, 30) +
  ggtitle("Searches") +
  theme(legend.position = "none")
book_tripDur <- ggplot(train[train$is_booking == 1], aes(x = tripDur)) +
  geom_histogram(binwidth = 1) +
  xlim(0, 30) +
  ggtitle("Bookings") +
  theme(legend.position = "none")

# Searches on top, bookings below.
grid.arrange(srch_tripDur, book_tripDur, nrow = 2, ncol = 1)

Booking Ahead


In [ ]:
# Days between the logged event and the requested check-in date. Negative
# values (check-in before the event date) are treated as invalid and set to NA.
train$bookAhead <- as.numeric(train$srch_ci - train$date)
train$bookAhead[train$bookAhead < 0] <- NA

# geom_bar() no longer accepts `binwidth`; geom_histogram() is the layer that
# bins a numeric variable. Bins are one week wide, x axis limited to 0-200 days.
srch_bookAhead <- ggplot(train[train$is_booking == 0], aes(x = bookAhead)) +
  geom_histogram(binwidth = 7) +
  xlim(c(0, 200)) +
  ggtitle("Searches") +
  theme(legend.position = "none")
book_bookAhead <- ggplot(train[train$is_booking == 1], aes(x = bookAhead)) +
  geom_histogram(binwidth = 7) +
  xlim(c(0, 200)) +
  ggtitle("Bookings") +
  theme(legend.position = "none")

# Searches on top, bookings below.
grid.arrange(srch_bookAhead, book_bookAhead, nrow = 2, ncol = 1)

Weekdays


In [ ]:
#train$week <- cut(train$date, "weeks")

# Numeric year, month and hour of day taken from the event timestamp.
train$Year <- as.numeric(format(train$date_time, format = "%Y"))
train$Month <- as.numeric(format(train$date_time, format = "%m"))
train$Hour <- as.numeric(format(train$date_time, format = "%H"))

# add weekdays (with ordered levels)
# Levels are listed Monday-first so that plots show the week in calendar order
# instead of alphabetically.
wd <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
to_weekday <- function(dates) {
  factor(weekdays(dates, abbreviate = TRUE), levels = wd)
}
train$weekday <- to_weekday(train$date)        # day of the logged event
train$weekday_ci <- to_weekday(train$srch_ci)  # check-in day
train$weekday_co <- to_weekday(train$srch_co)  # check-out day

In [ ]:
# Split the events once instead of re-filtering for every plot.
searches_only <- train[train$is_booking == 0]
bookings_only <- train[train$is_booking == 1]

# Bar chart of a weekday column: bars coloured per weekday, no legend,
# y axis fixed to [0, y_max] so charts in the same row share a scale.
weekday_bars <- function(rows, mapping, title, y_max) {
  ggplot(rows, mapping) +
    geom_bar() +
    ggtitle(title) +
    theme(legend.position = "none") +
    ylim(0, y_max)
}

# Weekday of the logged event.
srch_wd <- weekday_bars(searches_only, aes(x = weekday, fill = weekday),
                        "Searches", 100000)
book_wd <- weekday_bars(bookings_only, aes(x = weekday, fill = weekday),
                        "Bookings", 8200)

# Weekday of check-in.
srch_wd_ci <- weekday_bars(searches_only,
                           aes(x = weekday_ci, fill = weekday_ci),
                           "Searches", 100000)
book_wd_ci <- weekday_bars(bookings_only,
                           aes(x = weekday_ci, fill = weekday_ci),
                           "Bookings", 8200)

# Weekday of check-out.
srch_wd_co <- weekday_bars(searches_only,
                           aes(x = weekday_co, fill = weekday_co),
                           "Searches", 100000)
book_wd_co <- weekday_bars(bookings_only,
                           aes(x = weekday_co, fill = weekday_co),
                           "Bookings", 8200)

# Top row: searches; bottom row: bookings. Columns: event, check-in, check-out.
grid.arrange(srch_wd, srch_wd_ci, srch_wd_co,
             book_wd, book_wd_ci, book_wd_co,
             nrow = 2, ncol = 3)

Weekends


In [ ]:
# Flag bookings whose stay (check-in through check-out, both inclusive)
# contains a Saturday or a Sunday. Rows that are not bookings stay NA.
#
# Computed arithmetically instead of expanding every stay into a day sequence:
# with the ISO weekday of check-in (1 = Monday ... 7 = Sunday), the stay reaches
# a Saturday or Sunday exactly when weekday + length of stay >= 6. This is
# vectorised, does not depend on the locale's weekday names, and does not fail
# on missing or reversed dates (seq() raises an error for both).
booked_rows <- which(train$is_booking == 1)
check_in <- train$srch_ci[booked_rows]
stay_days <- as.numeric(train$srch_co[booked_rows] - check_in)
check_in_wday <- as.integer(format(check_in, "%u"))

has_weekend <- check_in_wday + stay_days >= 6
# A check-out before the check-in is invalid; report it as unknown.
has_weekend[which(stay_days < 0)] <- NA

with_weekend <- rep(NA, nrow(train))
with_weekend[booked_rows] <- has_weekend
train$withWeekend <- with_weekend
summary(train$withWeekend)

Package Bookings


In [ ]:
# Bookings that were part of a package: days booked ahead versus trip duration.
pkg_bookings <- train[train$is_booking == 1 & train$is_package == 1]

# Points are jittered and semi-transparent to reduce overplotting.
ggplot(pkg_bookings, aes(x = bookAhead, y = tripDur)) +
  geom_point(color = "blue", alpha = 0.2, size = 0.5, position = "jitter") +
  ggtitle("Package Bookings")

Feature Correlation


In [ ]:
# Columns entering the correlation analysis: the raw attributes, the engineered
# date features and the target (hotel_cluster).
cols <- c("site_name", "posa_continent", "user_location_country",
          "user_location_region", "user_location_city",
          "is_mobile", "is_package", "channel", "srch_adults_cnt",
          "srch_children_cnt", "srch_rm_cnt",
          "srch_destination_id", "srch_destination_type_id", "is_booking",
          "cnt", "hotel_continent", "hotel_country",
          "hotel_market", "Year", "Month", "Hour", "weekday", "tripDur",
          "bookAhead", "orig_destination_distance", "hotel_cluster")
df <- data.table(train[, cols, with = FALSE])

# impute missing values
# Unknown durations get the sentinel -1; a missing distance gets the mean of
# the observed distances. Each column is replaced directly rather than through
# a row subset of the whole table.
df$tripDur[is.na(df$tripDur)] <- -1
df$bookAhead[is.na(df$bookAhead)] <- -1
df$orig_destination_distance[is.na(df$orig_destination_distance)] <-
  mean(df$orig_destination_distance, na.rm = TRUE)

# cor() needs numeric input. Factor columns are turned into their integer level
# codes and numeric columns are truncated to whole numbers.
df[] <- lapply(df, as.integer)
# Rank-based (Spearman) correlation matrix, variables ordered by "AOE".
corrplot(cor(df, method = "spearman"), order = "AOE")

Save enriched data


In [ ]:
# Write the enriched table as gzip-compressed CSV. write.csv() opens the
# connection for writing and closes it again when it is done.
out_con <- gzfile("../../data/expedia/train_feat.csv.gz")
write.csv(train, file = out_con)